1. Working with key-value pairs

	1.1 Map transformation

		// Load the transactions file, split each line on "#" into fields, and
		// key every transaction by customer ID (field index 2).
		val tranFile = sc.textFile("file:///root/TrainingOnHDP/dataset/spark/data_transactions.txt")
		val tranData = tranFile.map(_.split("#"))
		var transByCust = tranData.map(tran => (tran(2).toInt, tran))

	1.2 Getting keys and values
	
		transByCust.keys.distinct().count()

	1.3 Counting values per key
	
		// Transactions per customer — countByKey returns a Map on the driver.
		transByCust.countByKey()
		// Total number of transactions (sum of the per-key counts).
		transByCust.countByKey().values.sum
		// Customer with the most purchases: sort (custId, count) pairs by count, take the last.
		val (cid, purch) = transByCust.countByKey().toSeq.sortBy(_._2).last
		// Complimentary transaction for that customer
		// (fields appear to be: date, time, custId, prodId, quantity, price).
		var complTrans = Array(Array("2015-03-30", "11:59 PM", "53", "4", "1", "0.00"))

	1.4 Looking up values for a single key
	
		// lookup returns all values for the key (customer 53) as a Seq on the driver.
		transByCust.lookup(53)
		transByCust.lookup(53).foreach(tran => println(tran.mkString(", ")))

	1.5 Using the mapValues transformation to change values in a pair RDD
	
		// Apply a 5% discount (field 5 = price) to purchases of product 25 with
		// quantity > 1. mapValues keeps the keys, so any partitioner is preserved.
		transByCust = transByCust.mapValues(tran => { 
				if (tran(3).toInt == 25 && tran(4).toDouble > 1) tran(5) = (tran(5).toDouble * 0.95).toString;
				tran})

	1.6 Using the flatMapValues transformation to add values to keys
	
		// For purchases of 5+ units of product 81, add a free (price 0.00)
		// companion transaction for one unit of product 70. flatMapValues lets a
		// single input value expand into several values under the same key.
		transByCust = transByCust.flatMapValues(tran => {
				if(tran(3).toInt == 81 && tran(4).toInt >= 5) {
					val cloned = tran.clone();
					cloned(5) = "0.00";
					cloned(3) = "70";
					cloned(4) = "1";
					List(tran, cloned)
				}else
					List(tran)
				})

	1.7 Using the reduceByKey transformation to merge all values of a key
	
	1.8 Using the foldByKey transformation as an alternative to reduceByKey
	
		// Amount spent per customer: fold the price field with a neutral zero of 0.
		val amounts = transByCust.mapValues(t => t(5).toDouble)
		val totals = amounts.foldByKey(0)((p1, p2) => p1 + p2).collect()
		totals.toSeq.sortBy(_._2).last
		// NOTE: foldByKey applies the zero value once per partition, so a
		// non-neutral zero (100000) gets added multiple times — shown on purpose.
		amounts.foldByKey(100000)((p1, p2) => p1 + p2).collect()

		// Add a second complimentary transaction (customer 76), merge both into
		// the pair RDD, and save the transactions re-joined with "#".
		complTrans = complTrans :+ Array("2015-03-30", "11:59 PM", "76", "63", "1", "0.00")
		transByCust = transByCust.union(sc.parallelize(complTrans).map(t => (t(2).toInt, t)))
		transByCust.map(t => t._2.mkString("#")).saveAsTextFile("file:///root/TrainingOnHDP/dataset/spark/output-transByCust")

	1.9 Using aggregateByKey to group all values of a key

		// List of product IDs (field 3) bought by each customer:
		// seqOp appends within a partition, combOp concatenates across partitions.
		val prods = transByCust.aggregateByKey(List[String]())(
		(prods, tran) => prods ::: List(tran(3)),
		(prods1, prods2) => prods1 ::: prods2)

		prods.collect()
		
2. Data partitioning and shuffling

	2.1 Understanding and avoiding unnecessary shuffling

		// Same aggregateByKey as in 1.9, repeated here to discuss its shuffle
		// behavior (it shuffles unless the RDD is already partitioned by key).
		val prods = transByCust.aggregateByKey(List[String]())(
		(prods, tran) => prods ::: List(tran(3)),
		(prods1, prods2) => prods1 ::: prods2)

	2.2 Shuffle caused by partitioner removal

		import org.apache.spark.rdd.RDD
		val rdd:RDD[Int] = sc.parallelize(1 to 10000)
		// map can change keys (swap does), so Spark drops the partitioner —
		// a subsequent reduceByKey must shuffle.
		rdd.map(x => (x, x*x)).map(_.swap).collect()
		rdd.map(x => (x, x*x)).reduceByKey((v1, v2)=>v1+v2).collect()

	2.3 Optimizing shuffling with an external shuffle service
	
		spark.shuffle.service.enabled = true
	
	2.4 Collecting partition data with a glom transformation
	
		// glom gathers each partition's elements into one Array, yielding an
		// RDD with a single Array element per partition (30 here).
		val list = List.fill(500)(scala.util.Random.nextInt(100))
		val rdd = sc.parallelize(list, 30).glom()
		rdd.collect()
		rdd.count()


	2.5 Joining, sorting and grouping data

		2.5.1 Joining data		
		
			// Re-key the transactions by product ID (field 3) and sum revenue per product.
			val transByProd = tranData.map(tran => (tran(3).toInt, tran))
			val totalsByProd = transByProd.mapValues(t => t(5).toDouble).reduceByKey{case(tot1, tot2) => tot1 + tot2}

			// Products file, keyed by product ID (field 0).
			val products = sc.textFile("file:///root/TrainingOnHDP/dataset/spark/data_products.txt").map(line => line.split("#")).map(p => (p(0).toInt, p))

		2.5.2 The four classic join transformations
	
			// Inner join: only products that appear in at least one transaction.
			val totalsAndProds = totalsByProd.join(products)
			totalsAndProds.first()

			// Left outer join keyed on products: the value is
			// (product fields, Option[total]); None in the SECOND slot marks a
			// product that was never sold. (The original filtered/mapped the
			// wrong tuple slots — it compared an Array to None, always false.)
			val totalsWithMissingProds = products.leftOuterJoin(totalsByProd)
			val missingProds = totalsWithMissingProds.filter(x => x._2._2 == None).map(x => x._2._1)
			missingProds.foreach(p => println(p.mkString(", ")))

			// Equivalent rightOuterJoin: the value is (Option[total], product fields),
			// so here the Option really is the FIRST slot.
			val totalsWithMissingProds = totalsByProd.rightOuterJoin(products)
			val missingProds = totalsWithMissingProds.filter(x => x._2._1 == None).map(x => x._2._2)
			missingProds.foreach(p => println(p.mkString(", ")))

		2.5.3 Using subtract and subtractByKey transformations to remove common values
		
			// subtractByKey keeps pairs whose keys do NOT appear in the other RDD:
			// the products with no transactions.
			val missingProds = products.subtractByKey(totalsByProd).values
			missingProds.foreach(p => println(p.mkString(", ")))

		2.5.4 Joining RDDs with the cogroup transformation
	
			// cogroup values are (Iterable[total], Iterable[product fields]);
			// an empty totals Iterable means the product was never sold.
			val prodTotCogroup = totalsByProd.cogroup(products)
			prodTotCogroup.filter(x => x._2._1.isEmpty).foreach(x => println(x._2._2.head.mkString(", ")))
			// Sold products re-keyed by product ID with (total, product fields) values.
			val totalsAndProds = prodTotCogroup.filter(x => !x._2._1.isEmpty).map(x => (x._2._2.head(0).toInt,(x._2._1.head, x._2._2.head)))

		2.5.5 Using the intersection transformation	
		
			totalsByProd.map(_._1).intersection(products.map(_._1))

		2.5.6 Combining two RDDs with the cartesian transformation
	
			// cartesian pairs every element of rdd1 with every element of rdd2.
			val rdd1 = sc.parallelize(List(7,8,9))
			val rdd2 = sc.parallelize(List(1,2,3))
			rdd1.cartesian(rdd2).collect()
			// Keep only pairs where the first element is divisible by the second.
			rdd1.cartesian(rdd2).filter(el => el._1 % el._2 == 0).collect()

		2.5.7 Joining RDDs with the zip transformation	
		
			// zip pairs elements by position; both RDDs must have the same number
			// of partitions and the same number of elements per partition.
			val rdd1 = sc.parallelize(List(1,2,3))
			val rdd2 = sc.parallelize(List("n4","n5","n6"))
			rdd1.zip(rdd2).collect()

		2.5.8 Joining RDDs with the zipPartitions transformation	
		
			// zipPartitions joins the iterators of corresponding partitions (the
			// second argument `true` preserves partitioning). zipAll pads the
			// shorter side with -1 / "empty" since the RDDs differ in length.
			val rdd1 = sc.parallelize(1 to 10, 10)
			val rdd2 = sc.parallelize((1 to 8).map(x=>"n"+x), 10)
			rdd1.zipPartitions(rdd2, true)((iter1, iter2) => {
				iter1.zipAll(iter2, -1, "empty")
				.map({case(x1, x2)=>x1+"-"+x2})}).collect()

		2.5.9 Sorting data

			// Sort by product name (field 1 of the product array in the value).
			val sortedProds = totalsAndProds.sortBy(_._2._2(1))
			sortedProds.collect()

			// Three alternative ways to make a type sortable:
			// 1) extend Ordered — an implicit conversion supplies the Ordering;
			case class Employee(lastName: String) extends Ordered[Employee] {
				override def compare(that: Employee) = this.lastName.compare(that.lastName)
			}

			// 2) an explicit implicit Ordering instance;
			implicit val emplOrdering = new Ordering[Employee] {
				override def compare(a: Employee, b: Employee) = a.lastName.compare(b.lastName)
			}

			// 3) Ordering.by. NOTE: (2) and (3) are alternatives — in compiled
			// code defining both in one scope is a duplicate/ambiguous implicit.
			implicit val emplOrdering: Ordering[Employee] = Ordering.by(_.lastName)

		2.5.10 Grouping data
		
			2.5.10.1 Grouping data with the combineByKey transformation
		
				// createCombiner: seed the per-customer accumulator from its first
				// transaction: (min unit price, max unit price, total quantity, total amount).
				def createComb = (t:Array[String]) => {
					val total = t(5).toDouble;
					val q = t(4).toInt;
					(total/q, total/q, q, total) }

				// mergeValue: fold one more transaction into a partition-local accumulator.
				def mergeVal:((Double,Double,Int,Double),Array[String])=>(Double,Double,Int,Double) =
					{ case((mn,mx,c,tot),t) => {
						val total = t(5).toDouble;
						val q = t(4).toInt;
						(scala.math.min(mn,total/q),scala.math.max(mx,total/q),c+q,tot+total) } }

				// mergeCombiners: merge accumulators coming from different partitions.
				def mergeComb:((Double,Double,Int,Double),(Double,Double,Int,Double))=>(Double,Double,Int,Double) =
					{case((mn1,mx1,c1,tot1),(mn2,mx2,c2,tot2)) =>
						(scala.math.min(mn1,mn2),scala.math.max(mx1,mx2),c1+c2,tot1+tot2) }

				// Per-customer stats, extended with the average unit price (total/quantity).
				val avgByCust = transByCust.combineByKey(createComb, mergeVal, mergeComb,
						new org.apache.spark.HashPartitioner(transByCust.partitions.size)).mapValues({case(mn,mx,cnt,tot) => (mn,mx,cnt,tot,tot/cnt)})
		 
				avgByCust.first()

				// Save "productFields#..., total" lines for the sold products.
				totalsAndProds.map(_._2).map(x=>x._2.mkString("#")+", "+x._1).saveAsTextFile("file:///root/TrainingOnHDP/dataset/spark/output-totalsPerProd")

				// Save the formatted per-customer statistics.
				avgByCust.map{ case (id, (min, max, cnt, tot, avg)) => "%d#%.2f#%.2f#%d#%.2f#%.2f".format(id, min, max, cnt, tot, avg)}.saveAsTextFile("file:///root/TrainingOnHDP/dataset/spark/output-avgByCust")

3. Understanding RDD dependencies

	3.1 RDD dependencies and Spark execution
	
		// Small job for inspecting RDD lineage: map/mapPartitions are narrow
		// dependencies, reduceByKey introduces a shuffle (wide) dependency.
		val list = List.fill(500)(scala.util.Random.nextInt(10))
		val listrdd = sc.parallelize(list, 5)
		val pairs = listrdd.map(x => (x, x*x))
		val reduced = pairs.reduceByKey((v1, v2)=>v1+v2)
		val finalrdd = reduced.mapPartitions(iter => iter.map({case(k,v)=>"K="+k+",V="+v}))
		finalrdd.collect()
		// toDebugString prints the lineage with its stage boundaries.
		println(finalrdd.toDebugString)

4. Using accumulators and broadcast variables to communicate with Spark executors

	4.1 Obtaining data from executors with accumulators
		
		// Accumulator incremented once per element on the executors; only the
		// driver may read its value.
		val acc = sc.accumulator(0, "acc name")
		val list = sc.parallelize(1 to 1000000)
		list.foreach(x => acc.add(1))
		acc.value
		// Deliberate failure demo: reading acc.value inside a task is not
		// allowed and throws an exception on the executors.
		list.foreach(x => acc.value)

	4.2 Writing custom accumulators
		
		val rdd = sc.parallelize(1 to 100)
		import org.apache.spark.AccumulableParam
		// Accumulable tracking (count, sum) so the mean can be derived afterwards.
		implicit object AvgAccParam extends AccumulableParam[(Int, Int), Int] {
				def zero(v:(Int, Int)) = (0, 0);
				// Merge two partial (count, sum) results from different partitions.
				def addInPlace(v1:(Int, Int), v2:(Int, Int)) = (v1._1+v2._1, v1._2+v2._2);
				// Add one Int sample: bump the count, add the value to the sum.
				def addAccumulator(v1:(Int, Int), v2:Int) = (v1._1+1, v1._2+v2);
		}
		val acc = sc.accumulable((0,0))
		rdd.foreach(x => acc += x)
		// mean = sum / count
		val mean = acc.value._2.toDouble / acc.value._1

	4.3 Accumulating values in accumulable collections
		
		import scala.collection.mutable.MutableList
		// Collect arbitrary values from executors into a driver-side collection.
		val colacc = sc.accumulableCollection(MutableList[Int]())
		rdd.foreach(x => colacc += x)
		colacc.value

5. Finding the maximum salaries in a given department

	// (department, (name, salary)) pairs.
	val deptEmployees = List(
      ("cs",("jack",1000.0)),
      ("cs",("bron",1200.0)),
      ("phy",("sam",2200.0)),
      ("phy",("ronaldo",500.0))
	)

	val employeeRDD = sc.makeRDD(deptEmployees)

	// Per department, keep whichever element has the larger salary; the
	// ("dummy",0.0) zero value loses to any positive salary.
	val maxByDept = employeeRDD.foldByKey(("dummy",0.0))((acc,element)=> if(acc._2 > element._2) acc else element)

	println("maximum salaries in each dept" + maxByDept.collect().toList)
  

6. Finding the employee with maximum salary

	// fold over the whole RDD with a dummy zero element that loses to any
	// positive salary, keeping the (name, salary) pair with the maximum salary.
	val employeeData = List(("Jack",1000.0),("Bob",2000.0),("Carl",7000.0))
	val employeeRDD = sc.makeRDD(employeeData)
	val dummyEmployee = ("dummy",0.0) 
	val maxSalaryEmployee = employeeRDD.fold(dummyEmployee)((acc,employee) => { if(acc._2 < employee._2) employee else acc})
	println("employee with maximum salary is"+maxSalaryEmployee)
  
  
7. Collecting unique values per key

	// Split "key=value" strings and aggregate the values of each key into a
	// HashSet, producing the distinct values per key.
	val keysWithValuesList = Array("foo=A", "foo=A", "foo=A", "foo=A", "foo=B", "bar=C", "bar=D", "bar=D")
	val data = sc.parallelize(keysWithValuesList)
	val kv = data.map(_.split("=")).map(v => (v(0), v(1))).cache()
	val initialSet = scala.collection.mutable.HashSet.empty[String]
	// seqOp: add one value to a partition-local set.
	val addToSet = (s: scala.collection.mutable.HashSet[String], v: String) => s += v
	// combOp: union the sets built on different partitions.
	val mergePartitionSets = (p1: scala.collection.mutable.HashSet[String], p2: scala.collection.mutable.HashSet[String]) => p1 ++= p2
	val uniqueByKey = kv.aggregateByKey(initialSet)(addToSet, mergePartitionSets)
	uniqueByKey.collect


8. Calculating the sum of values by key

	// Count occurrences per key with aggregateByKey: the seqOp ignores the
	// value and adds 1, the combOp sums the partition counts.
	val keysWithValuesList = Array("foo=A", "foo=A", "foo=A", "foo=A", "foo=B", "bar=C", "bar=D", "bar=D")
	val data = sc.parallelize(keysWithValuesList)
	val kv = data.map(_.split("=")).map(v => (v(0), v(1))).cache()
	val initialCount = 0
	val addToCounts = (n: Int, v: String) => n + 1
	val sumPartitionCounts = (p1: Int, p2: Int) => p1 + p2
	val countByKey = kv.aggregateByKey(initialCount)(addToCounts, sumPartitionCounts)
	countByKey.collect

9. Finding the maximum in each partition, then comparing the per-partition maximums to get the final max value

	// Global maximum via a direct reduce over all elements.
	val dataList = List(50.0,40.0,40.0,70.0)   
	val dataRDD = sc.makeRDD(dataList)  
	val maxValue =  dataRDD.reduce (_ max _)

	// Same result via glom: max within each partition's Array, then max across partitions.
	val maxValue = dataRDD.glom().map((value:Array[Double]) => value.max).reduce(_ max _)


10. Using glom for calculating weighted matrix

	Using the following to launch spark shell:
		spark-shell  --packages org.jblas:jblas:1.2.4

	
	import org.jblas.DoubleMatrix
	// Every row must have exactly as many columns as there are weights (3),
	// otherwise DoubleMatrix.mmul below fails on a dimension mismatch.
	// (The original rows "List(855,0,55.0,...)" had a stray comma instead of a
	// decimal point, producing 4-element rows.)
	val rowsList = List[List[Double]](
		List(50.0,40.0,44.0),
		List(88,44.0,44.0),
		List(855.0,55.0,44.0),
		List(855.0,55.0,70.0)
    )
	val weights = List(1.0,0.5,3)
	val rowRDD = sc.makeRDD(rowsList)
	// Per partition: build a matrix from the partition's rows and multiply it
	// by the weights column vector (3x1 after transpose), yielding one
	// weighted-sum column per partition.
	val result = rowRDD.glom().map( value =>{
		val doubleMatrix = new DoubleMatrix( value.map(value => value.toArray));
		val weightMatrix = new DoubleMatrix(1, weights.length,weights.toArray:_*);
		doubleMatrix.mmul( weightMatrix.transpose())
	})


11. Using combineByKey to calculate the average

	import org.apache.spark.HashPartitioner
	// One student's score for one subject.
	case class ScoreDetail(studentName: String, subjct: String, score: Float)

	val scores = List(ScoreDetail("A", "Math", 98),
					ScoreDetail("A", "English", 88),
					ScoreDetail("B", "Math", 75),
					ScoreDetail("B", "English", 78),
					ScoreDetail("C", "Math", 90),
					ScoreDetail("C", "English", 80),
					ScoreDetail("D", "Math", 91),
					ScoreDetail("D", "English", 80))

	// Key each score by student name and spread the pairs over 3 hash partitions.
	val scoresWithKey = for { i <- scores} yield (i.studentName, i)
	val scoresWithKeyRDD = sc.parallelize(scoresWithKey).partitionBy(new HashPartitioner(3)).cache

	// Inspect partition sizes and contents (println runs on the executors).
	scoresWithKeyRDD.foreachPartition(partition => println(partition.length))
	scoresWithKeyRDD.foreachPartition(partition => partition.foreach(item=>println(item._2)))

	// combineByKey accumulates (sum of scores, count) per student; the average
	// is then sum / count.
	val avgScoresRDD = scoresWithKeyRDD.combineByKey(
				(x:ScoreDetail)=>(x.score,1),
				(acc:(Float,Int), x:ScoreDetail)=>(acc._1 + x.score, acc._2+1),
				(acc1: (Float, Int), acc2: (Float, Int))=>(acc1._1 + acc2._1, acc1._2 + acc2._2)).map({
						case(key, value)=>(key, value._1/value._2)})

	avgScoresRDD.collect.foreach(println)

12. Return a new RDD with the specified number of partitions, placing original items into the partition returned by a user supplied function

	import org.apache.spark.Partitioner
	val x = sc.parallelize(Array(('J',"James"),('F',"Fred"),('A',"Anna"),('J',"John")), 3)
	// Custom 2-way partitioner: keys before 'H' go to partition 0, the rest to 1.
	val y = x.partitionBy(new Partitioner() {
			val numPartitions = 2;
			def getPartition(k:Any) = {
				if (k.asInstanceOf[Char] < 'H') 0 else 1
			}
	})
	// glom to inspect which elements landed in which partition.
	val yOut = y.glom().collect()

13. Create a Pair RDD, forming one pair for each item in the original RDD. The pair's key is calculated from the value via a user-supplied function.

	// keyBy derives each pair's key from the value (here: its first character).
	val x = sc.parallelize(Array("John", "Fred", "Anna", "James"))
	val y = x.keyBy(w => w.charAt(0))
	println(y.collect().mkString(", "))
	
14. Return a new RDD containing a statistical sample of the original RDD

	// sample(withReplacement = false, fraction = 0.4): each element is kept
	// with probability ~0.4, so the result size varies between runs.
	val x = sc.parallelize(Array(1, 2, 3, 4, 5))
	val y = x.sample(false, 0.4)
	println(y.collect().mkString(", "))	
	
15. Return a new RDD by applying a function to each partition of this RDD

	val x = sc.parallelize(Array(1,2,3), 2)
	// For each partition, emit the pair (partition sum, 50) as an iterator of
	// its two fields.
	def f(i:Iterator[Int])={ (i.sum,50).productIterator }
	val y = x.mapPartitions(f)
	val xOut = x.glom().collect()
	// Renamed from "Out" to "yOut" for consistency with xOut above and with
	// the parallel example in section 16.
	val yOut = y.glom().collect()
	
16. Return a new RDD by applying a function to each partition of this RDD, while tracking the index of the original partition

	val x = sc.parallelize(Array(1,2,3), 2)
	// For each partition, emit (partition index, partition sum) as an iterator
	// of its two fields.
	def f(partitionIndex:Int, i:Iterator[Int]) = {(partitionIndex, i.sum).productIterator}
	val y = x.mapPartitionsWithIndex(f)
	val xOut = x.glom().collect()
	val yOut = y.glom().collect()
	
  
  
